clear
set more off

* ==================== START START START ==================== *

* ------------------------------------------------------------------------------
* set paths
[...]
* ------------------------------------------------------------------------------
clear
set more off
* ==============================================================================


* 							 Preliminaries


* ============================================================================== 

* ------------------------------------------------------------------------------
* READ IN AND CLEAN PARCEL-TO-CENSUS BLOCK MATCHES
* ------------------------------------------------------------------------------

* keeping for posterity
// https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2010/TGRSHP10SF1.pdf

*____________________________________________
* read in and clean 1990 parcel to block data
import delimited using "$inputGIS/parcel_block_1990", stringcols(_all) clear
keep parcel stfid
replace parcel = subinstr(parcel,".000000","",1) // ArcGIS added; unnecessary
gen length = strlen(parcel)
replace parcel = "00" + parcel if length==9
replace parcel = "0" + parcel if length==10
drop length

duplicates tag parcel, gen(dup90)
// this occurs if parcel point intersects with the census block boundary
sort parcel
export delimited parcel stfid using "$output/duplicates_1990.csv" if dup90>0, q replace
export delimited parcel stfid using "$output/NULL_1990.csv" if stfid=="NULL", q replace
duplicates drop parcel, force

encode stfid, generate(blk90)
sort parcel
la da "temporary data: parcel to 1990 census block data file"
save "$intermediate/parcelblock_1990.dta", replace

*____________________________________________
* read in and clean 2000 parcel to block data
import delimited using "$inputGIS/parcel_block_2000", stringcols(_all) clear
keep parcel blkidfp00
replace parcel = subinstr(parcel,".000000","",1) // ArcGIS added; unnecessary
gen length = strlen(parcel)
replace parcel = "00" + parcel if length==9
replace parcel = "0" + parcel if length==10
drop length

duplicates tag parcel, gen(dup00) 
// this occurs if parcel point intersects with the census block boundary
sort parcel
export delimited parcel blkidfp00 using "$output/duplicates_2000.csv" if dup00>0, q replace
export delimited parcel blkidfp00 using "$output/NULL_2000.csv" if blkidfp00=="NULL", q replace
duplicates drop parcel, force

encode blkidfp00, generate(blk00)
sort parcel
la da "temporary data: parcel to 2000 census block data file"
save "$intermediate/parcelblock_2000.dta", replace

*____________________________________________
* read in and clean 2010 parcel to block data
import delimited using "$inputGIS/parcel_block_2010", stringcols(_all) clear
keep parcel geoid10
replace parcel = subinstr(parcel,".000000","",1) // ArcGIS added; unnecessary
gen length = strlen(parcel)
replace parcel = "00" + parcel if length==9
replace parcel = "0" + parcel if length==10
drop length

duplicates tag parcel, gen(dup10) 
// this occurs if parcel point intersects with the census block boundary
sort parcel
export delimited parcel geoid10 using "$output/duplicates_2010.csv" if dup10>0, q replace
export delimited parcel geoid10 using "$output/NULL_2010.csv" if geoid10=="NULL", q replace
duplicates drop parcel, force

encode geoid10, generate(blk10)
sort parcel
la da "temporary data: parcel to 2010 census block data file"
save "$intermediate/parcelblock_2010.dta", replace

*____________________________________________
* merge parcel to census block files
use "$intermediate/parcelblock_1990.dta", clear
merge 1:1 parcel using "$intermediate/parcelblock_2000.dta"
drop _merge

merge 1:1 parcel using "$intermediate/parcelblock_2010.dta"

replace blk90 = . if stfid=="NULL" // parcels not assigned to a blk should be .
replace blk00 = . if blkidfp00=="NULL"
replace blk10 = . if geoid10=="NULL"

keep parcel blk90 blk00 blk10
order parcel blk90 blk00 blk10
compress
sort parcel
la da "parcel to census block matching ('90, '00, '10 blocks)"
save "$intermediate/parcelblk.dta", replace

rm "$intermediate/parcelblock_1990.dta"
rm "$intermediate/parcelblock_2000.dta"
rm "$intermediate/parcelblock_2010.dta"


* ------------------------------------------------------------------------------
* read in housing CPI data
* ------------------------------------------------------------------------------

import excel using "$input/CPI.xlsx", first case(preserve) clear
rename sale_yr yros
sort yros
la da "CPI housing index relative to 2014"
save "$intermediate/CPI.dta", replace // CPI housing index relative to 2014

* ------------------------------------------------------------------------------
* read in and clean cash-for-grass enrollment data
* ------------------------------------------------------------------------------

use "$input/1tblWSLEnroll.dta", clear
rename _all, lower

* floor all enroll_dates
gen enroll_date2 = floor(enroll_date)
gen dif_day = day(enroll_date) - day(enroll_date2)
gen dif_month = month(enroll_date) - month(enroll_date2)
gen dif_year = year(enroll_date) - year(enroll_date2)

* drop original enroll_date and rename the floored enroll_date2, order and sort data
drop enroll_date dif_*
rename enroll_date2 enroll_date
format enroll_date %tdnn/dd/CCYY
order parcel enroll_date
sort parcel enroll_date

* tag parcels that convert more than once
duplicates tag parcel, gen(dupEnroll)
duplicates tag parcel if program!="XS", gen(dupEnroll_notXS)
la var dupEnroll "parcel converts n+1 times"
la var dupEnroll_notXS "parcel converts n+1 times outside of the pilot study"

* tag parcels that are enrolled in multiple conversions on the same day
duplicates tag parcel enroll_date, gen(dup_sameday)
la var dup_sameday "parcel enrolled n+1 times on this date"

// Consider multi-day enrollments as one enrollment
bys parcel enroll_date: egen conv_sqft2 = total(conv_sqft) if dup_sameday > 0
bys parcel enroll_date: egen tot_rebated2 = total(tot_rebated) if dup_sameday > 0

duplicates drop parcel enroll_date if dup_sameday > 0, force
replace conv_sqft = conv_sqft2 if dup_sameday > 0
replace tot_rebated = tot_rebated2 if dup_sameday > 0
drop conv_sqft2 tot_rebated2

sort wg parcel enroll_date

* sort and save data
sort parcel enroll_date
la da "WSL enrollment data sorted and same day SF duplicates addressed and dropped"
save "$intermediate/1tblWSLEnroll_sort.dta", replace

* ------------------------------------------------------------------------------
* read in and clean adjacency data from GIS
* ------------------------------------------------------------------------------
import delimited using "$inputGIS/sj2_adjacency.txt", stringcols(_all) clear
drop objectid join_count* target_fid* join_fid* calc_acres* assr_acres* 
drop txt_angle* tax_dist* shape_* label_clas* parceltype* pt_*
drop conversion multi zip

* drop trailing zeros in parcel
replace parcel = subinstr(parcel,".000000000000000","",1) 
// ArcGIS added; unnecessary
compress parcel

* drop trailing time in enroll_date
replace enroll_date = subinstr(enroll_date," 0:00:00","",1) 
// ArcGIS added; unnecessary
compress enroll_date

* destring numeric varibles
destring conv_sqft tot_rebated, replace

* drop self-adjacencies
drop if apn==apn_1 & apn==parcel

* drop "townhomes";
* cases where a single building is given multiple parcels, but only one converted
preserve
keep if apn_1 != parcel
sort parcel
save "$intermediate/temp_apn1_notInLVVWD2.dta", replace
// none of these are in AOdata_inLVVWD2.dta
restore
drop if apn_1 != parcel // now, apn_1==parcel
drop apn_1

* label variables
la var apn "parcel is adjacent to a converting parcel"
la var parcel "converting parcel" 

* destring enroll_date
rename enroll_date enroll_dateSTR
gen enroll_date = date(enroll_dateSTR,"MDY")
format enroll_date %tdnn/dd/CCYY
drop enroll_dateSTR
order apn parcel enroll_date

* Treat same-day enrollments as a single enrollment of a larger conversion.
//  add up area and rebate monies for the same-day enrollments and then drop duplicate same-day enrollments
duplicates tag apn parcel enroll_date, gen(dup_sameday)
la var dup_sameday "n+1 enrollments for the neighboring parcel occurs on enroll_date"

bys apn parcel enroll_date: egen conv_sqft2 = total(conv_sqft) if dup_sameday==1
bys apn parcel enroll_date: egen tot_rebated2 = total(tot_rebated) if dup_sameday==1

replace conv_sqft = conv_sqft2 if dup_sameday==1
replace tot_rebated = tot_rebated2 if dup_sameday==1

drop tot_rebated2 conv_sqft2
duplicates drop apn parcel enroll_date, force

* check agreement between original enrollment file and the enrollments contained here
rename (conv_sqft tot_rebated wg dup_sameday program) (conv_sqft2 tot_rebated2 wg2 dup_sameday2 program2)
sort parcel enroll_date
merge m:1 parcel enroll_date using "$intermediate/1tblWSLEnroll_sort.dta"
sort apn enroll_date parcel

* replace adjacency area and rebate values with original area and rebate values,
* and then restore adjacency data
keep if _merge == 3 
drop _merge

replace conv_sqft2 = conv_sqft if conv_sqft2 != conv_sqft
replace tot_rebated2 = tot_rebated if tot_rebated2 != tot_rebated

// * and *2 variables now contain the same information
drop conv_sqft2 tot_rebated2 wg2 dup_sameday2 program2 dupEnroll* dup_sameday

* consider parcel that neighbor participating parcels enrolling on the same day as one enrollment
duplicates tag apn enroll_date, gen(adjcnt)
replace adjcnt = adjcnt + 1
la var adjcnt "number of adjacent parcels that converted on the enroll_date"

bys apn enroll_date: egen conv_sqft2 = total(conv_sqft)
bys apn enroll_date: egen tot_rebated2 = total(tot_rebated)
replace conv_sqft = conv_sqft2 if adjcnt>1
replace tot_rebated = tot_rebated2 if adjcnt>1
drop tot_rebated2 conv_sqft2

duplicates drop apn enroll_date, force
rename parcel neighbor
la var neighbor "participating parcel adjacent to parcel: meaningless for adjcnt > 1"
rename apn parcel

* order sort and save neighbors data
drop wg program // all observations are "SF" and I don't really care about program type
order parcel neighbor enroll_date conv_sqft tot_rebated adjcnt
sort parcel enroll_date
la da "parcels adjacent to converting parcels"
save "$intermediate/neighbors.dta", replace

clear
set more off
* ==============================================================================


*				 		Clean Raw County Assessor Data


* ==============================================================================


* ------------------------------------------------------------------------------
* Clean AO extract FILES and create 'sales' and 'attributes' data files
* ------------------------------------------------------------------------------
import delim using "$inputAO/JMODaoextract.txt", delim("|") stringc(_all) varn(1) clear
sort parcel

foreach var of varlist _all {
quietly replace `var' = strtrim(`var')
}

rename taxdist tax
la var tax "tax district code"

* drop unnecessary variables
drop etalflag owner* mail_* legal_* loc_* ly* doc* exmpt* nbrhood *val
drop common_name ad* sectno township range subname comments abatement_type
drop assdyr fiscalyear // both 2017 for all values

* string to numeric format
replace poolcnt = "0" if poolcnt == ""
destring capacity constyr saleprice landacres poolcnt, replace
gen landsf = landacres * 43560 // 43560 sq-ft in 1 acre
la var landsf "area in sq-ft: landacres x 43,560 sq-ft"

* generate date variable
gen dos = date(saledate,"YM")
format dos %tdnn/dd/CCYY
la var dos "date of sale"

* save parcel attribute data from ao extract
preserve // ** PRESERVE **
drop saleprice saletype saledate dos
compress
order parcel landuse capacity constyr landacres landsf poolcnt tax
sort parcel
la da "AO Extract: parcel attributes"
save "$intermediate/ao_extract_attribute.dta", replace
restore // ** RESTORE **

* save parcel sale data (most recent) from ao extract
rename saleprice price
keep parcel price saletype saledate dos
order parcel saledate dos price saletype 
compress
sort parcel
la da "AO Extract: most recent parcel sale information"
save "$intermediate/ao_extract_sale.dta", replace

* ------------------------------------------------------------------------------
* Clean AO residential extract
* ------------------------------------------------------------------------------
import delim using "$inputAO/JMODaores.txt", delim("|") stringc(_all) varn(1) clear
rename parcelnumber parcel
sort parcel

* Explore duplicates on all variables
duplicates tag, gen(dup)
duplicates drop
drop dup

* trim string variables (e.g. "RES " to "RES")
foreach var of varlist _all {
quietly replace `var' = strtrim(`var')
}
* 'parcel keysuffix keysuffixn' should uniquely identify a parcel in AORes 
duplicates tag parcel keysuffix keysuffixn, gen(iddup)
la var iddup "duplicates from aores.txt on parcel keysuffix keysuffixn"

duplicates tag parcel keysuffix keysuffixn roof if iddup==1, gen(duproof)
la var duproof "duplicates from aores.txt on parcel keysuffix keysuffixn roof if iddup==1"

duplicates drop parcel keysuffix keysuffixn, force
destring keysuffix keysuffixn, replace

* sort and save ao_res file
sort parcel keysuffix keysuffixn
la da "cleaned up AO Res file from Clk Cnty AO"
save "$intermediate/temp_ao_res.dta", replace

* ------------------------------------------------------------------------------
* Clean AO residential extract (extra features) and merge with ao_res.dta above
* ------------------------------------------------------------------------------
import delim using "$inputAO/JMODaores_extra_features.txt", delim("|") stringc(_all) varn(1) clear
rename parcelnumber parcel
sort parcel

* trim variables
foreach var of varlist _all {
replace `var' = strtrim(`var')
}

* destring variables
destring miscunits keysuffix keysuffixn, replace

* keep pool information 
keep if misccode=="RPL1"|misccode=="RPL2"|misccode=="RPL3"|misccode=="RPL4"|misccode=="RPL5"|misccode=="RPLC"|misccode=="RPSP" 

* drop descriptor variables (these are the same as misccode)
drop miscsdescr miscldescr

* check for and address duplicates
duplicates tag parcel keysuffix keysuffixn misccode, gen(dup)

* add units by duplicates on parcel key* and misccode
*   ^ ASSUMPTION ^: if a pool is listed twice, it is not a mistake; rather it means there are two pools
bysort parcel keysuffix keysuffixn misccode: egen temp_units = total(miscunits)
replace miscunits = temp_units if dup == 1
duplicates drop parcel keysuffix keysuffixn misccode, force

replace miscunits = 1 if parcel == "[PARCEL CODE]" & misccode == "RPL2"
// visual inspection suggests 1 pool only

drop dup temp_units

* reshape data and replace subsequent missing values
sort parcel keysuffix keysuffixn
reshape wide miscunits, i(parcel keysuffix keysuffixn) j(misccode) string

foreach var of varlist miscunits* {
replace `var' = 0 if `var' == .
}

* generate aggregate pool variables
rename miscunits* *
gen poolsf = RPL1*300 + RPL2*450 + RPL3*512 + RPL4*648 + RPL5*800 + RPLC
drop RPL1 RPL2 RPL3 RPL4 RPL5 RPLC

rename RPSP spa

* merge with residential data file
sort parcel keysuffix keysuffixn
merge 1:1 parcel keysuffix keysuffixn using "$intermediate/temp_ao_res.dta" 
// no unmatched observations from res_extra
drop _merge

* assume any missing pool and spa information indicates no spa nor pool
replace spa = 0 if spa == .
replace poolsf = 0 if poolsf == .

* sort, label and save data
sort parcel keysuffix keysuffixn
la da "temporary residential file: ao_res merged with ao_res_extra_features"
save "$intermediate/temp_resExtra.dta", replace

* ------------------------------------------------------------------------------
* Reduce residential data file to be unique on parcels
* ------------------------------------------------------------------------------
use "$intermediate/temp_resExtra.dta", clear

* tag duplicates on parcel and drop parcels that are duplicated more than once
duplicates tag parcel, gen(dupp)
la var dupp "duplicates on parcel from merged aores and res_extra temp file"

drop if dupp > 1 // drops less than 2% of my residential data

*____________________________________________
* make a file that keeps all detached garage observations (dupp==1, typestories==18)
preserve // PRESERVE
keep if typestories == "18" & dupp==1
display _N
duplicates tag parcel, gen(dup18)
duplicates drop parcel, force

keep parcel grg* qualityclass yeabuilt effecyeabuilt dup18
rename (grg* qualityclass yeabuilt effecyeabuilt) (DGgrg* DGqualityclass DGyeabuilt DGeffecyeabuilt)
duplicates report parcel // file is unique on parcel
sort parcel
la da "dupp=1 typestories=18: detached garages"
save "$intermediate/tempDG.dta", replace
restore // RESTORE
*____________________________________________

* merge tempDG with cleaned up file
drop if typestories == "18" & dupp==1 
// these dropped observations comprise the data set I created above
duplicates tag parcel, gen(dupp2) 
// this will tag duplicate (on parcel) observations that do not have attached garages
drop if dupp==1 & dupp2 == 1 
// this drops those parcels that are duplicated, but do not have detached garages

sort parcel
merge 1:1 parcel using "$intermediate/tempDG.dta" // 12 of these should not match from using
drop if _merge==2 // this drops those parcels that are just garages

* create a flag for parcels that have a detached garage
gen extragar = 0
replace extragar = 1 if _merge==3
la var extragar "indicator for an extra detached garage (parcel was dupp=1 with a type=18)"

* finalize, sort, and save temporary residential file
drop _merge dup18 dupp2
sort parcel 
la da "temporary residential file: residential file unique on parcels"
save "$intermediate/temp_resUniqueParcel.dta", replace

* ------------------------------------------------------------------------------
* Final steps for creating the cleaned residential data file
* ------------------------------------------------------------------------------
use "$intermediate/temp_resUniqueParcel.dta", clear

* tab and drop variables I will not use in the hedonic regression
local vrbls "subfloor exteriowalls roof builtins intercom trashcompactor bimicrowave birefrigerator vacuumcleaner hardwiredsecurity basementoutside"
foreach var of local vrbls {
display "tab `var'"
tab `var'
display "Missing observations: "_N - r(N)
display ""
}
drop `vrbls'

* rename long variable names
rename (yeabuilt effecyeabuilt DGyeabuilt DGeffecyeabuilt) (built ebuilt DGbuilt DGebuilt)
rename (qualityclass DGqualityclass typestories heatingtype coolingtype) (quality DGquality ptype heat cool)
rename (totalrooms bedrooms familyden formaldini bathroomsfull bathroomshalf) (totrm bed famden dine fbath hbath)
rename (plumbingfixtures fireplacehirankcount fireplacelorankcount) (fixture firehi firelo)
rename (floor* floomarblegranite) (* marblegranite)
rename (*sqft) (*sf)
rename (firstfloorsf secondfloorsf abovesecondsf basementsf basementfinsf convaddsf) (f1sf f2sf fgt2sf f0sf f0finsf addsf)

* convert numeric data from strings to numbers and replace missing with zero
local numVars "keysuffix keysuffixn built ebuilt DGbuilt DGebuilt totrm bed fbath hbath famden dine fixture firehi firelo hardwood carpet vinyl ceramic marblegranite tavertine concrete f0sf f0finsf f1sf f2sf fgt2sf addsf grgsf DGgrgsf carportsf"
foreach var of local numVars {
destring `var', replace
replace `var' = 0 if `var'==.
}
* Create variable for total home size and drop observations with zero home size
gen housesf = f0finsf + f1sf + f2sf + fgt2sf + addsf
la var housesf "f0finsf + f1sf + f2sf + fgt2sf + addsf"
drop if housesf == 0

* create addition indicators based on convaddsf variable
gen addition = 0
replace addition = 1 if addsf>0
la var addition "indicator for the presence of an addition: convaddsqft > 0"

* create addition indicator based on difference between year built and effective year built
gen addition2 = 0
la var addition2 "year built != effective year built"
replace addition2 = 1 if ebuilt != built // alternative method for determining if a house underwent an addition

gen delta_eb = ebuilt - built
replace delta_eb = 999 if delta_eb >=11 // to save space in the tabulation results (999 is >= 11 years)
la var delta_eb "(effective year built) - (year built)"

gen addition3 = 0
la var addition3 "if extragar=1 and addition2=0: 1 if DGbuilt OR DGebuilt != built"
replace addition3 = 1 if (DGbuilt != built | DGebuilt!= built) & addition2==0 & extragar == 1

* drop floor specific square footage variables
drop f0sf f0finsf f1sf f2sf fgt2sf addsf

sort parcel
la da "AO Res: res merged with extra features, unique on parcels"
save "$intermediate/ao_res.dta", replace

* ------------------------------------------------------------------------------
** Clean AO sales
* ------------------------------------------------------------------------------
import delim using "$inputAO/JMODaosales.txt", delim("|") stringc(_all) varn(1) clear

* trim string variables, destring price and compress data
foreach var of varlist _all {
quietly replace `var' = strtrim(`var')
}
destring price, replace
rename propertycondition pc
la var pc "property condition: vacant, improved, or no data"
drop grant*
compress

save "$intermediate/tempaos.dta", replace // for testing...can remove when finished

* ------------------------------------------------------------------------------
** APPEND EXTRACT AND REPEAT SALE DATA and keep only those sales from 
**   extract that are not in ao sales
* ------------------------------------------------------------------------------

use "$intermediate/tempaos.dta", clear // so that I don't have to keep reloading the text file
sort parcel or_bk or_pg
append using "$intermediate/ao_extract_sale.dta", gen(fromx)
drop if price == . & fromx == 1 // drop any parcels from EXTRACT with missing sale information
replace pc = "x" if pc == ""
sort parcel price or_bk or_pg
la var fromx "sale observation from ao extract"

* generate time and date of sale
gen time = substr(saledate,11,9) if fromx == 0
replace saledate = substr(saledate,1,10) if fromx == 0
replace saledate = subinstr(saledate,"-","",2) if fromx == 0
compress saledate
drop time

* generate sale date variable
gen useSD = 0 if fromx == 0 // indicator for whether saledate is used over or_bk
gen saledate2 = date(saledate,"YMD") if fromx == 0
gen or_bk2 = date(or_bk,"YMD") if fromx == 0
format saledate2 or_bk2 %tdnn/dd/CCYY

* replace dos with book sale date unless book sale date is weird 
replace dos = or_bk2 if or_bk2 !=. & fromx == 0 
// replace dos with book sale date for non-missing book sale dates

replace dos = saledate2 if or_bk2==. & fromx==0 
// replace the missing and weird or_bk dates with "saledate" dates
replace dos = saledate2 if year(or_bk2) < 1900 
// replace the one instance where year = 216
replace useSD = 1 if or_bk2==. & fromx==0
replace useSD = 1 if year(or_bk2) < 1900
la var useSD "1 if saledate is used instead of or_bk"
sort parcel dos or_pg

* check for duplicates among extract and sales data
gen year = year(dos)
gen month = month(dos)
order parcel dos year month price or_bk or_pg saledate pc salerating saletype
duplicates tag parcel year month price saletype, gen(dup) 
// these are the only common variables between AO Sales and AO Extract
drop if dup>0 & fromx==1 // I wish I had the exact date, but I believe this is the best I can assess if AO Extract has the same info
drop dup

sort parcel dos or_pg


* ------------------------------------------------------------------------------
** Finalize AO Sales
* ------------------------------------------------------------------------------

* Keep all "R" saletypes
keep if saletype == "R" // keeps about 65% of my data

* Keep all "Q" sales, as well as the
keep if salerating == "Q" | fromx == 1

replace salerating = "x" if fromx==1
drop saletype salerating

* keep sales between jan. 1, 1996 and June 12, 2014
gen test = 1
replace test = 0 if inrange(dos,date("1/1/1996","MDY"),date("6/12/2014","MDY"))
gen test2 = 0
replace test2 = 1 if dos > date("6/12/2014","MDY")
replace test2 = 2 if dos < date("1/1/1996","MDY")

keep if test == 0 // keep all sales occuring within 1/1/96 and 6/12/14
drop test test2

* address duplicates by parcel & dos
sort parcel dos or_pg
duplicates tag parcel dos, gen(dupdos)
duplicates tag parcel dos price, gen(dupdosp)

// I assume something funny is occuring for these duplicate observations. thus...
keep if dupdos == 0
duplicates tag parcel or_bk, gen(dup2)

* drop unneeded variables
drop dup* or_bk or_bk2 or_pg saledate saledate2 year month

* create repeat sales flag
duplicates tag parcel, gen(saletot)
replace saletot = saletot + 1
la var saletot "total number of instances the parcel sold"

sort parcel dos
by parcel: gen salecnt = _n
la var salecnt "index for i-th sale of a parcel"

gen resale = 0
replace resale = 1 if saletot>1
la var resale "indicator for parcels selling more than once"

* compress, sort, and save data and close log file
compress
order parcel dos price pc useSD
sort parcel dos
la da "AO Sales: 1/1/96-6/12/14 RQ sales, drop same day sales, price unadjusted"
save "$intermediate/ao_sales.dta", replace

clear
set more off
* ==============================================================================


*	 						Create hedonic panels


* ==============================================================================


* ------------------------------------------------------------------------------
** Merge AO data
* ------------------------------------------------------------------------------

* merge residential data with attribute data, and keep single family types (landuse 110)
use "$intermediate/ao_res.dta", clear
merge 1:1 parcel using "$intermediate/ao_extract_attribute.dta"

* keep matched observations with single family landuse classification
keep if _merge == 3 & landuse=="110"
drop _merge

* comparing pool data from res with pool data from extract
gen poolsfgt0 = 0
replace poolsfgt0 = 1 if poolsf>0
replace poolsf = . if poolsf == 0 & poolcnt > 0
drop poolsfgt0

* comparing year built (ao res) with construction year (extract)
gen bflg = 0
replace bflg = 1 if built != constyr
la var bflg "1 if year built (built) DNE construction year (constyr)"

* merge data with ao sales and keep matched observations
sort parcel
merge 1:m parcel using "$intermediate/ao_sales.dta"
  // 74,111 observations have no associated sales information
  // 189,038 observations have no associated parcel information
keep if _merge==3
drop _merge

* drop unneeded variables
drop landuse capacity recordtype keysuffix*

* replace detattched garage construction variables with missing if the extra garage flag is 0
replace DGbuilt = . if extragar==0
replace DGebuilt = . if extragar==0

* augment garage square footage variable to reflect the extra garage if year built is after date of sale 
gen garsf_1 = grgsf
replace garsf_1 = grgsf + DGgrgsf if extragar == 1 & DGbuilt <= year(dos) 
gen garsf_2 = grgsf
replace garsf_2 = grgsf + DGgrgsf if extragar == 1 & DGebuilt <= year(dos)
la var garsf_1 "augmented garage area based on sale year in/after garage build year"
la var garsf_2 "augmented garage area based on sale year in/after garage effective build year"

* sort and save master AO dataset
order parcel dos price constyr built ebuilt DGbuilt DGebuilt  land* housesf bed fbath hbath pool* grgsf quality DGgrgsf DGquality 
sort parcel dos
la da "Clark county AO data (from assessor)"
save "$intermediate/AOdata.dta", replace

* ------------------------------------------------------------------------------
* keep only parcels within LVVWD service district boundary
* ------------------------------------------------------------------------------

use "$input/1tblSFR_Pcl.dta", clear // could also use 1tblSFR_PclSvc.dta (see "Preliminaries")
rename _all, lower

keep parcel lot_size const_yr sale_date sale_price
sort parcel
merge 1:m parcel using "$intermediate/AOdata.dta"
sort parcel dos
gen test = const_yr - constyr

* keep only those parcels within the LVVWD service district and drop unneeded variables
keep if _merge==3
drop lot_size const_yr sale_price sale_date test _merge

sort parcel dos
la da "AO data within LVVWD service district"
save "$intermediate/AOdata_inLVVWD.dta", replace


* ------------------------------------------------------------------------------
* create additional variables needed for analysis
* ------------------------------------------------------------------------------
use "$intermediate/AOdata_inLVVWD.dta", clear

*_______________________________________________________________________________
* drop missing variables or variables that equal 0
duplicates report parcel dos // no duplicates!
drop if landsf == 0

*_______________________________________________________________________________
* create age variable
gen yros = year(dos) // year of sale
label var yros "year of house sale"

gen age = yros - built
la var age "age of the house (years)"

gen nage = 0
replace nage = -1 if age < 0
la var nage "-1 if age is less than zero"

drop if age < 0 // assuming most are sales of vacant land
drop nage

*_______________________________________________________________________________
* convert sale price to $2014 and create new price variables
sort parcel dos yros
merge m:1 yros using "$intermediate/CPI.dta"
gen yrtest = 0
replace yrtest = 1 if yros < 1996
keep if _merge==3 
drop _merge yrtest

replace price = price * CPI
la var price "CPI adjusted price ($2014)"

* create new price and age variables
gen p100k = price/100000 // house sale price in $100,000
gen annp7 = price * (0.07/(1+0.07)) // house sale price adjusted to rental value (7%)
gen annp5 = price * (0.05/(1+0.05)) // house sale price adjusted to rental value (5%)
gen lnp = ln(price) 
gen lnp7 = ln(annp7)
gen lnp5 = ln(annp5)

label var p100k "2014 CPI adjusted sale price ($100k)"
label var lnp "ln(2014 CPI adjusted sale price)"
label var lnp7 "ln(annualized (7%) 2014 CPI adjusted sale price)"
label var lnp5 "ln(annualized (5%) 2014 CPI adjusted sale price)"

*_______________________________________________________________________________
* merge with census block data
sort parcel dos
merge m:1 parcel using "$intermediate/parcelblk.dta"
keep if _merge==3
drop _merge

*_______________________________________________________________________________
* create an ID that will index the sales number (used for merging with enrollment and neigbhors
sort parcel dos
by parcel: gen merge_id = _n

* save data
order parcel merge_id
sort parcel dos
la da "AO data within LVVWD service district; with enrollment&neighbors merge_id"
save "$intermediate/AOdata_inLVVWD2.dta", replace


* ------------------------------------------------------------------------------
** Prepare intermediate hedonic panel for merger with enrollment and neighbors
* ------------------------------------------------------------------------------
use "$intermediate/AOdata_inLVVWD2.dta", clear

* generate a list of parcels with number of times the parcel sold
by parcel: egen nsale = max(merge_id) // maximum # of sales for a given parcel

keep parcel nsale
duplicates drop
sort parcel
la da "list of parcels with number of times sold: used for merging with enrollment & neighbors"
save "$intermediate/temp_nsales.dta", replace


* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
*			CREATE THE HEDONIC PANEL
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

* ------------------------------------------------------------------------------
** merge sales data with enrollment data
* ------------------------------------------------------------------------------
use "$intermediate/1tblWSLEnroll_sort.dta", clear 
sort parcel enroll_date

* create a flag for parcels that will have missing values, and thus understate total xeric area
gen byte flg_noArea = 0
replace flg_noArea = 1 if conv_sqft == .
keep if flg_noArea == 1
keep parcel flg_noArea
duplicates drop parcel, force
sort parcel
save "$intermediate/temp_flg.dta", replace

* merge "missing" flag with enrollment data: 
*  this applies the "missing" flag to any parcel that is associated with a missing enrollment date
use "$intermediate/1tblWSLEnroll_sort.dta", clear
sort parcel enroll_date
merge m:1 parcel using "$intermediate/temp_flg.dta"
drop _merge
replace flg_noArea = 0 if flg_noArea == .

* merge data with temp_nsales
sort parcel enroll_date
merge m:1 parcel using "$intermediate/temp_nsales.dta"
keep if _merge==3 // this ensures that I only keep participating parcels that show up in AO
drop _merge

* expand enrollments to match the number of sales by enrollments
expand nsale
bys parcel enroll_date: gen merge_id = _n // sales id for matching with AO sales
order parcel merge_id
sort parcel merge_id enroll_date

* merge preparred enrollment data with AO_sales data
merge m:1 parcel merge_id using "$intermediate/AOdata_inLVVWD2.dta"
gen byte participant = 0
replace participant = 1 if _merge==3 // eventual participating parcel
replace conv_sqft = 0 if participant == 0 // here is my assumption that non-participants are not xeriscaped
replace flg_noArea = 0 if participant == 0
drop _merge

* drop any observations where wg is not single family (SF)
drop if wg=="CI"
drop if wg=="MF"
drop wg


* ------------------------------------------------------------------------------
** create indicators for xeriscape and total areas and create hedonic panel
* ------------------------------------------------------------------------------

*_______________________________________________________________________________
* Create Xeriscape code id and area variables for hedonic regression: 
* Parcels that converted to xeriscape prior sale are considered xeriscaped 
* For multi-conversions, the total sqft converted prior to sale is considered

* create xeriscape indicator
gen xeric = 0 // xeriscape indicator variable
replace xeric = 1 if participant == 1 & dos > enroll_date
la var xeric "indicator for a conversion occuring before sale date"

* sum total converted area prior to sale of parcel (xarea variable)
bys parcel dos: egen xarea = total(conv_sqft) if xeric == 1, missing // total treats . as 0

* generate latest enrollment date (for post june 2009 indicator)
by parcel dos: egen lastenroll = max(enroll_date) if xeric == 1
format lastenroll %tdnn/dd/CCYY
la var lastenroll "last date of enrollment prior to sale"

* drop duplicates to return to the same N as the original AO (net of the 8 obs you dropped for being non-SF)
sort parcel dos enroll_date // ensures that later enrollments occuring after sale date are not kept
duplicates drop parcel dos, force
drop conv_sqft enroll_date

replace xarea = 0 if xarea==. & xeric==0
sort parcel dos

label var xeric "indicator = 1 if parcel converted prior to parcel sale"
label var xarea "total area converted prior to parcel sale"

*_______________________________________________________________________________
* create post 2003 and post june 09 variables 
generate post2003 = 0
replace post2003 = 1 if built >= 2003
la var post2003 "indicator for homes built in/after 2003 which could not have a front lawn"

generate xeric_post2003 = xeric * post2003
generate xarea_post2003 = xarea * post2003

* generate post june 2009 dummy, and interaction with xeriscape
generate post609 = 0
replace post609 = 1 if lastenroll >= date("6/1/2009","MDY") & xeric == 1
la var post609 "indicator for enrollment dates on/after june 1, 2009"

generate xeric_post609 = xeric * post609
generate xarea_post609 = xarea * post609

*_______________________________________________________________________________
* Save temporary hedonic panel and list of parcels converting prior to sale (used for neighbors analysis below)
drop nsale 
// this is missing for all non particpants
drop saletot salecnt resale
// after dropping age<0, these sale counts do not accurately reflect repeat sales
sort parcel dos
la data "Hedonic panel"
save "$intermediate/TEMP_hedonic_panel.dta", replace



* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
*			CREATE THE SPILLOVER HEDONIC PANEL
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 

* ------------------------------------------------------------------------------
* read in and merge adjacent parcels data with SNWA AO data
* ------------------------------------------------------------------------------

use "$intermediate/neighbors.dta", clear
sort parcel enroll_date

* create a flag for parcels that will have missing values, and thus understate total xeric area
gen byte flg_noadjArea = 0
replace flg_noadjArea = 1 if conv_sqft == .
keep if flg_noadjArea == 1
keep parcel flg_noadjArea
duplicates drop parcel, force
sort parcel
save "$intermediate/temp_adjflg.dta", replace

* merge "missing" flag with enrollment data: 
*  this applies the "missing" flag to any parcel that is associated with a missing enrollment date
use "$intermediate/neighbors.dta", clear
sort parcel enroll_date
merge m:1 parcel using "$intermediate/temp_adjflg.dta"
drop _merge
replace flg_noadjArea = 0 if flg_noadjArea == .

* merge data with temp_nsales
sort parcel enroll_date
merge m:1 parcel using "$intermediate/temp_nsales.dta"
keep if _merge==3 // this ensures that I only keep participating parcels that show up in AO
drop _merge

* expand enrollments to match the number of sales by enrollments
expand nsale
bys parcel enroll_date: gen merge_id = _n // sales id for matching with AO sales
order parcel merge_id
sort parcel merge_id enroll_date

* merge preparred enrollment data with AO_sales data
merge m:1 parcel merge_id using "$intermediate/AOdata_inLVVWD2.dta"
gen byte adjacent = 0
replace adjacent = 1 if _merge==3 // eventual participating parcel
replace conv_sqft = 0 if adjacent == 0 // here is my assumption that non-participants are not xeriscaped
replace flg_noadjArea = 0 if adjacent == 0
drop _merge

* remove any parcels that are CI or MF
drop if parcel == "[PARCEL ID]"
drop if parcel == "[PARCEL ID]" 
drop if parcel == "[PARCEL ID]" 
drop if parcel == "[PARCEL ID]" 
drop if parcel == "[PARCEL ID]"

* ------------------------------------------------------------------------------------------------
** create indicators for adjacent xeriscape and total area and create the spillover hedonic panel
* ------------------------------------------------------------------------------------------------

*_______________________________________________________________________________
* Create Xeriscape code id and area variables for hedonic regression: 
* Parcels with an adjacent converting neigbhor are considered in this analysis (but parcels that themselves converted prior to sale are ignored)

* create xeriscape indicator
gen axeric = 0 // xeriscape indicator variable
replace axeric = 1 if adjacent == 1 & dos > enroll_date
la var axeric "indicator for a neighboring conversion occuring before sale date"

* sum total converted area prior to sale of parcel (xarea variable)
bys parcel dos: egen axarea = total(conv_sqft) if axeric == 1, missing // total treats . as 0

* generate latest enrollment date (for post june 2009 indicator)
by parcel dos: egen alastenroll = max(enroll_date) if axeric == 1
format alastenroll %tdnn/dd/CCYY
la var alastenroll "last date of adjacent enrollment prior to sale"

* drop duplicates to return to the same N as the original AO (net of the 8 obs you dropped for being non-SF)
sort parcel dos enroll_date // ensures that later enrollments occuring after sale date are not kept
duplicates drop parcel dos, force
drop conv_sqft enroll_date

replace axarea = 0 if axarea==. & axeric==0
sort parcel dos

label var axeric "1 if adjacent parcel converted prior to parcel sale"
label var axarea "total adjacent area converted prior to parcel sale"

*_______________________________________________________________________________
* create post 2003 and post june 09 variables 
generate apost2003 = 0
replace apost2003 = 1 if built >= 2003
la var apost2003 "indicator for homes built in/after 2003 which could not have a front lawn"

generate axeric_post2003 = axeric * apost2003
generate axarea_post2003 = axarea * apost2003

* generate post june 2009 dummy, and interaction with xeriscape
generate apost609 = 0
replace apost609 = 1 if alastenroll >= date("6/1/2009","MDY") & axeric == 1
la var apost609 "indicator for enrollment dates on/after june 1, 2009"

generate axeric_post609 = axeric * apost609
generate axarea_post609 = axarea * apost609

* Save hedonic panel
rename merge_id amerge_id
sort parcel dos
drop nsale // this is missing for all non particpants
drop saletot salecnt resale // after dropping age<0, these sale counts do not accurately reflect repeat sales
la data "Hedonic spillover panel: non-xeriscaped homes, and non-xeriscaped neighbors of xeriscape"
save "$intermediate/TEMP_hedonicspill_panel.dta", replace


* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
*			MERGE TEMP PANELS AND FINALIZE HEDONIC PANEL
* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
use "$intermediate/TEMP_hedonic_panel.dta", clear
sort parcel dos

merge 1:1 parcel dos using "$intermediate/TEMP_hedonicspill_panel.dta"

* generate quarter dummy
generate mos = month(dos)
generate quart = 0
replace quart = 1 if inrange(mos,1,3)
replace quart = 2 if inrange(mos,4,6)
replace quart = 3 if inrange(mos,7,9)
replace quart = 4 if inrange(mos,10,12)

* define indicators for outliers
foreach var of varlist price housesf age landsf bed fbath hbath poolsf garsf_1 garsf_2 {
quietly summarize `var', detail
gen byte `var'_l090 = 0
gen byte `var'_595 = 0
gen byte `var'_199 = 0
replace `var'_l090 = 1 if inrange(`var', r(p10), r(p90))
replace `var'_595 = 1 if inrange(`var', r(p5), r(p95))
replace `var'_199 = 1 if inrange(`var', r(p1), r(p99))
}

* define numeric parcel id's
destring parcel, gen(parcel2)
format parcel2 %12.0f

* define improved land indicator
gen land_improved = 0
replace land_improved = 1 if pc=="I"

sort parcel dos
la da "hedonic panel including participants and those adjacent to participants"
save "$intermediate/hedonic_panel2.dta", replace

rm "$intermediate/TEMP_hedonic_panel.dta"
rm "$intermediate/TEMP_hedonicspill_panel.dta"

*_______________________________________________________________________________
* merge hedonic panel with particpants who apply but never complete the
* conversion (DNFs)
use "$intermediate/hedonic_panel2.dta", clear
rename _merge _merge0
merge m:1 parcel using "$output/enrollDNF_mod.dta"
drop if _merge==2 // drop DNFs that do not match with any parcel in hedonic sample
la var _merge "from dnf enrollment merger"

* define filter: I want to ignore all non-participant, non-adjacent, non-dnfers (remove these guys from control sample)
gen filter = 0
replace filter = 1 if _merge==1 & adjacent ==0 & participant == 0
la var filter "non-adjacent non-participant non-dnf parcels"

gen dnfenrl = 0
replace dnfenrl = 1 if _merge==3 & participant == 1
la var dnfenrl "dnf-ers that later participated"

* further refine control sample: create filters for inelligibility and rechecks
gen impure = 0
replace impure = 1 if filter==0 & participant == 0 & adjacent == 0 & (recheck == 1 | notel == 1)
la var impure "flag for recheck or ineligible controls"

* save updated panel
sort parcel dos
la da "hedonic panel including participants and those adjacent to participants"
save "$intermediate/hedonic_panel.dta", replace